## Script for Dictionary of Constituency Language in Speech 
## JBS 3 August 2017

# Remove every (non-hidden) object from the workspace before the run.
# This does not detach packages or reset options.
rm(list=ls())

# Packages. The trailing notes list the functions this script calls from each
# one; packages without a note are not referenced in the code visible here and
# are kept in case they are needed elsewhere.
library(quanteda)    # dictionary(), corpus(), corpus_subset(), dfm(), convert(), ndoc()
library(readtext)
library(data.table)  # fread()
library(reshape2)
library(tidyr)       # gather()
library(stringr)
library(plyr)        # join()
library(MASS)
library(readstata13)


#### 
## 2005-2010
####
# set path here

# Root directory of the replication files. Edit this line to match the local
# machine; every input file below is located by appending to `path`.
path <- "~/Dropbox (Personal)/Rebel Summaries/APSR_SKLLO_RepFiles"
setwd(path)

# Rebel-vote table for 2005-2010, read in wide form (the division columns
# X2005.05.24_1 through X2010.04.08_135 are stacked in the next step).
dta.0510 <- read.csv(paste0(path, "/RawData/2005-2010/rebels.csv"))

# Reshape to long format (MP-vote): the division column name goes into
# `votenum` and the cell value into `rebel`.
data_long <- gather(
  dta.0510,
  key = votenum,
  value = rebel,
  X2005.05.24_1:X2010.04.08_135
)

# Build the division identifier used for merging: strip every "X" from the
# column label, then replace every "." with "-".
data_long$Division <- gsub("[.]", "-", gsub("X", "", data_long$votenum))

# Division-level information (vote info with 2nd reading info); only columns
# 2 to 11 of the file are kept.
divinfo <- read.csv(
  paste0(path, "/CalcData/divisions_wFreeVote_reading_0510.csv")
)[, 2:11]

# Attach the division information to every MP-vote row (plyr left join, so
# all rows of `data_long` are retained).
data_long <- join(data_long, divinfo, by = "Division", type = "left")

# Party-specific vote tables.
data_long.lab <- data_long[data_long$Party == "Lab", ]
data_long.con <- data_long[data_long$Party == "Con", ]

# Load the 2005-2010 speech text file.
speechtext <- fread(
  paste0(path, "/RawData/2005-2010/speechTEXT.csv"),
  sep = ","
)

# Keep only the rows with Substantive equal to 1.
outdata.speech <- speechtext[speechtext$Substantive == 1, ]

# Drop repeated speech texts, keeping the first occurrence. Repeats occur
# because a single speech is sometimes tied to more than one division when the
# votes happen in the same timeframe.
# NOTE(review): duplicates are detected on Text alone, so identical text given
# by two different MPs would also be dropped -- confirm this is intended.
outdata.speech <- outdata.speech[!duplicated(outdata.speech$Text), ]

# MP-by-division identifier of the form "PubWhipID;Division".
outdata.speech$MPDiv <- as.factor(
  paste(outdata.speech$PubWhipID, outdata.speech$Division, sep = ";")
)

# Collapse all speeches sharing an identifier into one space-separated string.
collapsedtext <- by(
  data = outdata.speech$Text,
  INDICES = outdata.speech$MPDiv,
  FUN = paste,
  collapse = " "
)

# Split the identifier back into its two parts for merging: the division is
# what remains after removing the first "<digits>;" run, the MP id is what
# remains after removing everything from the first ";" onwards.
Division <- sub("\\d+;", "", names(collapsedtext), perl = TRUE)
PubWhipID <- sub(";.+", "", names(collapsedtext), perl = TRUE)

# One row per MP-division pair with its collapsed speech text.
TextCollapse <- data.frame(
  PubWhipID = as.numeric(PubWhipID),
  Division = as.factor(Division),
  Text = as.vector(collapsedtext),
  stringsAsFactors = FALSE
)
 
# Join the collapsed speeches onto one party's vote table and keep the rows
# used in the analysis.
#
# speeches: data frame with PubWhipID, Division and Text columns.
# votes:    long-format vote table with PubWhipID, Division, rebel and
#           FreeVote columns.
# Returns the merged data frame restricted to rows that have speech text and
# whose FreeVote equals 0.
merge_speeches_with_votes <- function(speeches, votes) {
  # type = "right" keeps only the MP-division pairs present in `votes`; pairs
  # without a matching speech get Text = NA.
  merged <- join(
    speeches, votes,
    by = c("PubWhipID", "Division"),
    type = "right"
  )

  # Missing rebel indicators are recoded as 0.
  merged$rebel[is.na(merged$rebel)] <- 0

  # Drop vote rows for which no speech text exists.
  merged <- merged[!is.na(merged$Text), ]

  # Keep rows with FreeVote == 0. which() discards rows whose FreeVote is NA;
  # indexing with the bare logical vector would instead insert an all-NA row
  # for each of them, and those rows would later be counted as documents.
  merged[which(merged$FreeVote == 0), ]
}

dtaLab <- merge_speeches_with_votes(TextCollapse, data_long.lab)
dtaCon <- merge_speeches_with_votes(TextCollapse, data_long.con)


# Constituency dictionary: a single key, "const", holding the phrases (some
# with a trailing "*" wildcard) that count as constituency language.
myDict <- dictionary(list(
  const = c(
    "my constituen*",
    "I represent",
    "where I live",
    "my area",
    "my surgery",
    "my voters",
    "my elector*"
  )
))

# Build one corpus per party. Column 3 (expected to be the collapsed speech
# text) is renamed to "text" first; corpus() is called without naming a text
# field, so it presumably looks for a column with that name -- confirm against
# the installed quanteda version.
colnames(dtaLab)[3] <- "text"
txtcorpusL <- corpus(dtaLab)

colnames(dtaCon)[3] <- "text"
txtcorpusC <- corpus(dtaCon)

# Split the Labour corpus on the `Rebels` variable carried over from the merged
# data (presumably the number of rebels in the division -- confirm).
# NOTE(review): the two subsets use Rebels > 1 and Rebels == 0, so documents
# with Rebels == 1 fall into neither; confirm the > 1 threshold is intended.
txtcorpusLreb <- corpus_subset(txtcorpusL, Rebels > 1)
txtcorpusLnoreb <- corpus_subset(txtcorpusL, Rebels == 0)

# Document-feature matrices restricted to the constituency dictionary.
constdfmL <- dfm(txtcorpusL, dictionary = myDict)
constdfmC <- dfm(txtcorpusC, dictionary = myDict)
constdfmLR <- dfm(txtcorpusLreb, dictionary = myDict)
constdfmLNR <- dfm(txtcorpusLnoreb, dictionary = myDict)

# Dense count matrices; column 1 is the first (and only) dictionary key.
constdfmLmat <- convert(constdfmL, "matrix")
constdfmCmat <- convert(constdfmC, "matrix")
constdfmLRmat <- convert(constdfmLR, "matrix")
constdfmLNRmat <- convert(constdfmLNR, "matrix")

# Share of documents with at least one dictionary match. The values are
# printed explicitly so they are also shown when the script is run through
# source(), which does not auto-print top-level expressions.

# Labour, all documents
print(sum(constdfmLmat[, 1] > 0) / ndoc(constdfmL))
# Conservative, all documents
print(sum(constdfmCmat[, 1] > 0) / ndoc(constdfmC))

# Labour, Rebels > 1
print(sum(constdfmLRmat[, 1] > 0) / ndoc(constdfmLR))
# Labour, Rebels == 0
print(sum(constdfmLNRmat[, 1] > 0) / ndoc(constdfmLNR))

#############################
### 2010-15
###
# Remove every object created by the preceding section before starting the
# 2010-2015 run (loaded packages stay attached).
rm(list = ls())

# Root directory of the replication files. Edit this line to match the local
# machine; every input file below is located by appending to `path`.
path <- "~/Dropbox (Personal)/Rebel Summaries/APSR_SKLLO_RepFiles"
setwd(path)

# Rebel-vote table for 2010-2015, read in wide form (the division columns
# X2010.06.07_1 through X2015.03.26_188 are stacked in the next step).
dta.1015 <- read.csv(paste0(path, "/RawData/2010-2015/rebels.csv"))

# Reshape to long format (MP-vote): the division column name goes into
# `votenum` and the cell value into `rebel`.
data_long <- gather(
  dta.1015,
  key = votenum,
  value = rebel,
  X2010.06.07_1:X2015.03.26_188
)

# Build the division identifier used for merging: strip every "X" from the
# column label, then replace every "." with "-".
data_long$Division <- gsub("[.]", "-", gsub("X", "", data_long$votenum))

# Division-level information (vote info with 2nd reading info); only columns
# 2 to 11 of the file are kept.
divinfo <- read.csv(
  paste0(path, "/CalcData/divisions_wFreeVote_reading_1015.csv")
)[, 2:11]

# Attach the division information to every MP-vote row (plyr left join, so
# all rows of `data_long` are retained).
data_long <- join(data_long, divinfo, by = "Division", type = "left")

# Party-specific vote tables.
data_long.lab <- data_long[data_long$Party == "Lab", ]
data_long.con <- data_long[data_long$Party == "Con", ]

# Load the 2010-2015 speech text file.
speechtext <- fread(
  paste0(path, "/RawData/2010-2015/speechTEXT.csv"),
  sep = ","
)

# Keep only the rows with Substantive equal to 1.
outdata.speech <- speechtext[speechtext$Substantive == 1, ]

# Drop repeated speech texts, keeping the first occurrence. Repeats occur
# because a single speech is sometimes tied to more than one division when the
# votes happen in the same timeframe.
# NOTE(review): duplicates are detected on Text alone, so identical text given
# by two different MPs would also be dropped -- confirm this is intended.
outdata.speech <- outdata.speech[!duplicated(outdata.speech$Text), ]

# MP-by-division identifier of the form "PubWhipID;Division".
outdata.speech$MPDiv <- as.factor(
  paste(outdata.speech$PubWhipID, outdata.speech$Division, sep = ";")
)

# Collapse all speeches sharing an identifier into one space-separated string.
collapsedtext <- by(
  data = outdata.speech$Text,
  INDICES = outdata.speech$MPDiv,
  FUN = paste,
  collapse = " "
)

# Split the identifier back into its two parts for merging: the division is
# what remains after removing the first "<digits>;" run, the MP id is what
# remains after removing everything from the first ";" onwards.
Division <- sub("\\d+;", "", names(collapsedtext), perl = TRUE)
PubWhipID <- sub(";.+", "", names(collapsedtext), perl = TRUE)

# One row per MP-division pair with its collapsed speech text.
TextCollapse <- data.frame(
  PubWhipID = as.numeric(PubWhipID),
  Division = as.factor(Division),
  Text = as.vector(collapsedtext),
  stringsAsFactors = FALSE
)
 
# Join the collapsed speeches onto one party's vote table and keep the rows
# used in the analysis.
#
# speeches: data frame with PubWhipID, Division and Text columns.
# votes:    long-format vote table with PubWhipID, Division, rebel and
#           FreeVote columns.
# Returns the merged data frame restricted to rows that have speech text and
# whose FreeVote equals 0.
merge_speeches_with_votes <- function(speeches, votes) {
  # type = "right" keeps only the MP-division pairs present in `votes`; pairs
  # without a matching speech get Text = NA.
  merged <- join(
    speeches, votes,
    by = c("PubWhipID", "Division"),
    type = "right"
  )

  # Missing rebel indicators are recoded as 0.
  merged$rebel[is.na(merged$rebel)] <- 0

  # Drop vote rows for which no speech text exists.
  merged <- merged[!is.na(merged$Text), ]

  # Keep rows with FreeVote == 0. which() discards rows whose FreeVote is NA;
  # indexing with the bare logical vector would instead insert an all-NA row
  # for each of them, and those rows would later be counted as documents.
  merged[which(merged$FreeVote == 0), ]
}

dtaLab <- merge_speeches_with_votes(TextCollapse, data_long.lab)
dtaCon <- merge_speeches_with_votes(TextCollapse, data_long.con)


# Constituency dictionary: a single key, "const", holding the phrases (some
# with a trailing "*" wildcard) that count as constituency language.
myDict <- dictionary(list(
  const = c(
    "my constituen*",
    "I represent",
    "where I live",
    "my area",
    "my surgery",
    "my voters",
    "my elector*"
  )
))

# Build one corpus per party. Column 3 (expected to be the collapsed speech
# text) is renamed to "text" first; corpus() is called without naming a text
# field, so it presumably looks for a column with that name -- confirm against
# the installed quanteda version.
colnames(dtaLab)[3] <- "text"
txtcorpusL <- corpus(dtaLab)

colnames(dtaCon)[3] <- "text"
txtcorpusC <- corpus(dtaCon)

# Split the Labour corpus on the `Rebels` variable carried over from the merged
# data (presumably the number of rebels in the division -- confirm).
# NOTE(review): the two subsets use Rebels > 1 and Rebels == 0, so documents
# with Rebels == 1 fall into neither; confirm the > 1 threshold is intended.
txtcorpusLreb <- corpus_subset(txtcorpusL, Rebels > 1)
txtcorpusLnoreb <- corpus_subset(txtcorpusL, Rebels == 0)

# Document-feature matrices restricted to the constituency dictionary.
constdfmL <- dfm(txtcorpusL, dictionary = myDict)
constdfmC <- dfm(txtcorpusC, dictionary = myDict)
constdfmLR <- dfm(txtcorpusLreb, dictionary = myDict)
constdfmLNR <- dfm(txtcorpusLnoreb, dictionary = myDict)

# Dense count matrices; column 1 is the first (and only) dictionary key.
constdfmLmat <- convert(constdfmL, "matrix")
constdfmCmat <- convert(constdfmC, "matrix")
constdfmLRmat <- convert(constdfmLR, "matrix")
constdfmLNRmat <- convert(constdfmLNR, "matrix")

# Share of documents with at least one dictionary match. The values are
# printed explicitly so they are also shown when the script is run through
# source(), which does not auto-print top-level expressions.

# Labour, all documents
print(sum(constdfmLmat[, 1] > 0) / ndoc(constdfmL))
# Conservative, all documents
print(sum(constdfmCmat[, 1] > 0) / ndoc(constdfmC))

# Labour, Rebels > 1
print(sum(constdfmLRmat[, 1] > 0) / ndoc(constdfmLR))
# Labour, Rebels == 0
print(sum(constdfmLNRmat[, 1] > 0) / ndoc(constdfmLNR))

